{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/dumping-twitter-6-july-2019.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-02-22-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-03-08-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-03-28-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-04-12-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-04-22-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-02-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-11-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-31-twitter-dump-in.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2021-03-06-twitter.tar\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2021-04-21-twitter.tar\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2021-06-06-twitter.tar\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/compiled-2022-06-08-twitter.tar\n",
    "\n",
    "# !tar -xf compiled-2021-03-06-twitter.tar\n",
    "# !tar -xf compiled-2021-04-21-twitter.tar\n",
    "# !tar -xf compiled-2021-06-06-twitter.tar\n",
    "# !tar -xf compiled-2022-06-08-twitter.tar\n",
    "# !rm *.tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/2020-05-02-twitter-dump-en.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/mesolitica/fasttext-language-detection-v3/resolve/main/fasttext.ftz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "import malaya\n",
    "import fasttext\n",
    "\n",
    "# Language-identification model; expects fasttext.ftz in the working directory.\n",
    "MODEL_PATH = 'fasttext.ftz'\n",
    "model = fasttext.load_model(MODEL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from malaya.text.function import (\n",
    "    simple_textcleaning,\n",
    "    language_detection_textcleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "# The 2019 dump has a fixed name; the remaining dumps are discovered by pattern.\n",
    "# glob() returns matches in arbitrary order, so sort for a reproducible file order.\n",
    "bahasa = [\n",
    "    'dumping-twitter-6-july-2019.json',\n",
    "    *sorted(glob('*twitter-dump-in.json')),\n",
    "    *sorted(glob('compiled/*.json')),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "500000it [00:28, 17814.45it/s]\n",
      "500000it [00:29, 16875.09it/s]\n",
      "500000it [00:28, 17461.18it/s]\n",
      "500000it [00:30, 16595.75it/s]\n",
      "500000it [00:29, 16678.71it/s]\n",
      "75025it [00:04, 16814.56it/s]\n",
      "500000it [00:29, 16840.28it/s]\n",
      "500000it [00:29, 17018.94it/s]\n",
      "500000it [00:29, 17025.86it/s]\n",
      "500000it [00:28, 17323.11it/s]\n",
      "500000it [00:28, 17437.36it/s]\n",
      "500000it [00:29, 17240.99it/s]\n",
      "500000it [00:29, 17078.78it/s]\n",
      "500000it [00:28, 17342.76it/s]\n",
      "500000it [00:28, 17313.71it/s]\n"
     ]
    }
   ],
   "source": [
    "# Tweets are kept only when fastText's top prediction is one of these Malay labels.\n",
    "MALAY_LABELS = {'__label__local-malay', '__label__standard-malay'}\n",
    "# NOTE: machine-specific location of the line-delimited dumps; adjust for your environment.\n",
    "SPLITTED_GLOB = '/home/husein/ssd3/twitter/*.splitted'\n",
    "\n",
    "texts = []\n",
    "# glob() order is arbitrary, so sort for a reproducible processing order.\n",
    "for path in sorted(glob(SPLITTED_GLOB)):\n",
    "    # Read as UTF-8 explicitly instead of relying on the platform default encoding.\n",
    "    with open(path, encoding='utf-8') as fopen:\n",
    "        # Each line is one JSON object carrying the tweet under 'data_text'.\n",
    "        for line in tqdm(fopen):\n",
    "            tweet = json.loads(line)['data_text']\n",
    "            label = model.predict(language_detection_textcleaning(tweet))[0][0]\n",
    "            if label in MALAY_LABELS:\n",
    "                texts.append(tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████| 6597867/6597867 [01:09<00:00, 94293.38it/s]\n",
      "100%|██████████████████████████████| 1673772/1673772 [00:20<00:00, 81862.47it/s]\n",
      "100%|██████████████████████████████| 1121928/1121928 [00:13<00:00, 82730.54it/s]\n",
      "100%|██████████████████████████████| 1134736/1134736 [00:13<00:00, 81484.96it/s]\n",
      "100%|██████████████████████████████| 1175497/1175497 [00:14<00:00, 83465.79it/s]\n",
      "100%|██████████████████████████████| 2131906/2131906 [00:25<00:00, 82481.73it/s]\n",
      "100%|██████████████████████████████| 1250595/1250595 [00:15<00:00, 79486.97it/s]\n",
      "100%|██████████████████████████████| 1617795/1617795 [00:20<00:00, 78060.22it/s]\n",
      "100%|██████████████████████████████| 1711555/1711555 [00:21<00:00, 78220.19it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73788.60it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73656.98it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71129.02it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71842.70it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 74468.63it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73831.54it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 76627.79it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 70629.99it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 76160.37it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71442.46it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 76314.92it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 76688.75it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 72175.73it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71064.92it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 72456.85it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 74809.86it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 76142.56it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 75220.34it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71446.67it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 72608.10it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71016.93it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 71377.10it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73919.88it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73364.39it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 70642.62it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73017.17it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 74676.44it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 74752.50it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73045.98it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 74320.92it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 75053.82it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 77289.38it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 79608.34it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73411.65it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 75062.44it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73645.55it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 72867.86it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 73901.32it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 70783.19it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 74898.27it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 76691.89it/s]\n",
      "100%|████████████████████████████████| 100000/100000 [00:01<00:00, 78549.74it/s]\n"
     ]
    }
   ],
   "source": [
    "# Tweets are kept only when fastText's top prediction is one of these Malay labels.\n",
    "MALAY_LABELS = {'__label__local-malay', '__label__standard-malay'}\n",
    "\n",
    "for path in bahasa:\n",
    "    # Each file is a single JSON array; read as UTF-8 instead of the platform default.\n",
    "    with open(path, encoding='utf-8') as fopen:\n",
    "        records = json.load(fopen)\n",
    "\n",
    "    for record in tqdm(records):\n",
    "        # Dict entries carry the tweet under 'data_text'; any other entry is used as the text itself.\n",
    "        tweet = record['data_text'] if isinstance(record, dict) else record\n",
    "\n",
    "        label = model.predict(language_detection_textcleaning(tweet))[0][0]\n",
    "        if label in MALAY_LABELS:\n",
    "            texts.append(tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19097572"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# De-duplicate while keeping the first occurrence of each tweet in insertion order.\n",
    "# dict.fromkeys is used instead of set() because set iteration order for strings\n",
    "# changes between kernel restarts (hash randomisation), which made the output order unreproducible.\n",
    "texts = list(dict.fromkeys(texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12711645"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['@skutereletrik @asna_wey @ewanzp @HaHZaff Aku ingtkn tanda 2 kt dahi tu kncing manis',\n",
       " 'Akk aku kecoh giler nk suruh adik dia kawin  https://t.co/DHCxvMfsbS',\n",
       " '@rxalkal Itu ava km.ikan cupang',\n",
       " '@nubuvi 30 sen gano syat wt tk ingat',\n",
       " '@emirgentel Aku dah kahwin anak dh 2 baru tahu sape yang baling ..boleh lah nanti aku cite kt anak aku thks bro',\n",
       " 'Intai2 kat dalam taska.. kesian tengok anak aku duduk kat sudut termenung sorang2 lepas pengasuh letak dia dalam bi https://t.co/R5FifOVmO5',\n",
       " '@brgsjks Nep nak parking dekat belah depan. HCO B dekat dengan HCO A. Tapi srs laju datang 10 minit sebelum appointment.',\n",
       " '@NadyaKamidan Baik la sayang ',\n",
       " '@nurainaisyahh hahahahahaha weh bukan selalu tidur lambat sejak wfh ni',\n",
       " 'RT PDRMsia: POSTING PILIHAN: PEMANDU POTONG BARISAN SECARA BAHAYA DISAMAN\\n\\nBacaan lanjut di pautan seperti berikut, https://t.co/oDMlPzCHIA']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "texts[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 12711645/12711645 [00:09<00:00, 1314690.71it/s]\n"
     ]
    }
   ],
   "source": [
    "# Write the de-duplicated tweets as JSON Lines: one JSON-encoded string per line.\n",
    "with open('dedup-twitter.jsonl', 'w') as out_file:\n",
    "    out_file.writelines(f'{json.dumps(tweet)}\\n' for tweet in tqdm(texts))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
